load("covariates.RData")
load("Disease.RData")
dat <- merge(x = full_data, y = disease[c("SEQN", "bp", "bi", "cd")],
by = "SEQN", all = TRUE)
dat <- dat[complete.cases(dat), ]
dat$ncd <- dat$bp == 1 | dat$bi == 1 | dat$cd == 1
dat <- dat[dat$ALQ151 != "Don't know", ]
dat <- dat[dat$DBQ700 != "Don't know", ]
dat$ALQ151 <- factor(dat$ALQ151)
dat$SLQ050 <- factor(dat$SLQ050)
dat$DBQ700 <- factor(dat$DBQ700)
dat$DBQ700 = relevel(dat$DBQ700, ref = "Excellent")
dat$ALQ151 = relevel(dat$ALQ151, ref = "Yes")
dat$SLQ050 = relevel(dat$SLQ050, ref = "Yes")
head(dat)
dat$ncd_p <- factor(dat$ncd, levels = c("TRUE", "FALSE", "P"), labels = c("TRUE", "FALSE", "P-value"))
rndr <- function(x, name, ...){
if (length(x) == 0){
y <- dat[[name]]
s <- rep("", length(render.default(x = y, name = name)))
if (is.numeric(y)){
p <- t.test(y ~ dat$ncd_p)$p.value
} else{
p <- chisq.test(table(y, droplevels(dat$ncd_p)))$p.value
}
s[2] <- sub("<", "<", format.pval(p, digits = 3, eps = 0.001))
s
} else{
render.default(x = x, name = name, ...)
}
}
rndr.strat <- function(label, n, ...){
ifelse(n == 0, label, render.strat.default(label, n, ...))
}
table1(~ DBQ700 + ALQ120Q + ALQ151 + DR1TVC + DR1TCAFF + SLD012 + SLQ050 +
SMD030 + WHD020 + bmi| ncd_p, data = dat, droplevels = F,
render = rndr, render.strat = rndr.strat, overall = F)
| TRUE (n=833) |
FALSE (n=301) |
P-value | |
|---|---|---|---|
| DBQ700 | |||
| Excellent | 76 (9.1%) | 38 (12.6%) | 0.00578 |
| Very good | 156 (18.7%) | 74 (24.6%) | |
| Good | 319 (38.3%) | 119 (39.5%) | |
| Fair | 212 (25.5%) | 53 (17.6%) | |
| Poor | 70 (8.4%) | 17 (5.6%) | |
| ALQ120Q | |||
| Mean (SD) | 2.85 (13.4) | 3.09 (4.49) | 0.648 |
| Median [Min, Max] | 1.00 [0.00, 365] | 2.00 [0.00, 60.0] | |
| ALQ151 | |||
| Yes | 266 (31.9%) | 71 (23.6%) | 0.00825 |
| No | 567 (68.1%) | 230 (76.4%) | |
| DR1TVC | |||
| Mean (SD) | 70.7 (80.7) | 75.2 (94.6) | 0.465 |
| Median [Min, Max] | 41.7 [0.00, 921] | 42.8 [0.00, 745] | |
| DR1TCAFF | |||
| Mean (SD) | 207 (258) | 232 (360) | 0.264 |
| Median [Min, Max] | 144 [0.00, 2950] | 156 [0.00, 4530] | |
| SLD012 | |||
| Mean (SD) | 7.64 (1.68) | 7.57 (1.38) | 0.488 |
| Median [Min, Max] | 8.00 [2.00, 13.5] | 7.50 [3.00, 11.5] | |
| SLQ050 | |||
| Yes | 341 (40.9%) | 69 (22.9%) | <0.001 |
| No | 492 (59.1%) | 232 (77.1%) | |
| SMD030 | |||
| Mean (SD) | 18.2 (7.15) | 18.8 (6.60) | 0.251 |
| Median [Min, Max] | 17.0 [0.00, 58.0] | 18.0 [0.00, 54.0] | |
| WHD020 | |||
| Mean (SD) | 189 (43.9) | 171 (35.2) | <0.001 |
| Median [Min, Max] | 182 [78.0, 386] | 165 [101, 300] | |
| bmi | |||
| Mean (SD) | 28.7 (7.29) | 26.2 (5.99) | <0.001 |
| Median [Min, Max] | 28.2 [0.000956, 64.2] | 25.8 [0.00105, 53.7] |
Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Draw Boxplots for the continuous variables)
ggboxplot(data = dat, x = "ncd", y = "ALQ120Q", add = "jitter", color = "ncd") +
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("How often drink alcohol over past 12 mos") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "DR1TVC", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Vitamin C (mg)") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "DR1TCAFF", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Caffeine (mg)") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "SLD012", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Sleep hours") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "SMD030", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Age started smoking cigarettes regularly") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "WHD020", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Current self-reported weight (pounds)") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat, x = "ncd", y = "bmi", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("BMI") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Draw Mosaic plots for discrete variables)
pval <- function(p){
if (p < 0.001) {
return("< 0.001")
} else {return(round(p, 3))}
}
ggplot(data = dat) +
geom_mosaic(aes(x = product(DBQ700, ncd), fill = DBQ700)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Diet") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$DBQ700, correct = F)$p.value)))
ggplot(data = dat) +
geom_mosaic(aes(x = product(ALQ151, ncd), fill = ALQ151)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Alcohol") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$ALQ151, correct = F)$p.value)))
ggplot(data = dat) +
geom_mosaic(aes(x = product(SLQ050, ncd), fill = SLQ050)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Having trouble sleeping") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$SLQ050, correct = F)$p.value)))
Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Use two sample t-test with two-sided 0.05 significance level to compare the mean level of each variable between two NCDs outcomes)
t.test(dat$ncd,dat$ALQ120Q)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$ALQ120Q
## t = -6.2825, df = 1136.2, p-value = 4.736e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.861839 -1.499713
## sample estimates:
## mean of x mean of y
## 0.7345679 2.9153439
t.test(dat$ncd,dat$DR1TVC)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$DR1TVC
## t = -28.327, df = 1133.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -76.05595 -66.20261
## sample estimates:
## mean of x mean of y
## 0.7345679 71.8638448
t.test(dat$ncd,dat$DR1TCAFF)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$DR1TCAFF
## t = -24.809, df = 1133, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -229.6009 -195.9459
## sample estimates:
## mean of x mean of y
## 0.7345679 213.5079365
t.test(dat$ncd,dat$SLD012)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$SLD012
## t = -139.12, df = 1303.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.984244 -6.790006
## sample estimates:
## mean of x mean of y
## 0.7345679 7.6216931
t.test(dat$ncd,dat$SMD030)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$SMD030
## t = -84.556, df = 1142, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -18.04232 -17.22400
## sample estimates:
## mean of x mean of y
## 0.7345679 18.3677249
t.test(dat$ncd,dat$WHD020)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$WHD020
## t = -145.1, df = 1133.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -185.6006 -180.6481
## sample estimates:
## mean of x mean of y
## 0.7345679 183.8589065
t.test(dat$ncd,dat$bmi)
##
## Welch Two Sample t-test
##
## data: dat$ncd and dat$bmi
## t = -130.06, df = 1141.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -27.71861 -26.89471
## sample estimates:
## mean of x mean of y
## 0.7345679 28.0412255
Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Use Chi-square tests to compare the proportions of diagnosed with NCDs for each discrete variable)
chisq.test(dat$ncd,dat$DBQ700)
##
## Pearson's Chi-squared test
##
## data: dat$ncd and dat$DBQ700
## X-squared = 14.531, df = 4, p-value = 0.00578
chisq.test(dat$ncd,dat$ALQ151)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dat$ncd and dat$ALQ151
## X-squared = 6.9775, df = 1, p-value = 0.008254
chisq.test(dat$ncd,dat$SLQ050)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dat$ncd and dat$SLQ050
## X-squared = 30.303, df = 1, p-value = 3.695e-08
# Univariate logistic regression
colnames(dat) <- c("SEQN","Diet","Alcohol_freq","Alcohol",
"Vitamin_C","Caffeine","Sleep","Trouble_sleep",
"Smoking_age","smok_yn","Weight","BMI","bp","bi","cd","ncd","ncd_p")
univ = function(var)
{
form = formula(paste0("ncd ~ ", var))
logit = glm(form, data = dat, family = "binomial")
p = coef(summary(logit))[,4]
OR = data.frame(round(exp(cbind(OR = coef(logit), confint(logit))), 3),
p_value = format.pval(p, digits = 3, eps = 0.001))[-1,]
colnames(OR) <- c("OR", "2.5%", "97.5%", "p_value")
return(OR)
}
varlist = colnames(dat)[!(colnames(dat) %in% c("SEQN", "bp", "bi", "cd","ncd","smok_yn","ncd_p"))]
table_uni <- do.call(rbind.data.frame, lapply(varlist, univ))
knitr::kable(table_uni)
| OR | 2.5% | 97.5% | p_value | |
|---|---|---|---|---|
| DietVery good | 1.054 | 0.650 | 1.694 | 0.82898 |
| DietGood | 1.340 | 0.855 | 2.077 | 0.19466 |
| DietFair | 2.000 | 1.219 | 3.269 | 0.00578 |
| DietPoor | 2.059 | 1.080 | 4.050 | 0.03138 |
| Alcohol_freq | 0.998 | 0.987 | 1.013 | 0.761 |
| AlcoholNo | 0.658 | 0.483 | 0.888 | 0.00685 |
| Vitamin_C | 0.999 | 0.998 | 1.001 | 0.432 |
| Caffeine | 1.000 | 0.999 | 1.000 | 0.2 |
| Sleep | 1.027 | 0.946 | 1.115 | 0.5267 |
| Trouble_sleepNo | 0.429 | 0.315 | 0.578 | <0.001 |
| Smoking_age | 0.990 | 0.972 | 1.008 | 0.269 |
| Weight | 1.012 | 1.008 | 1.015 | < 0.001 |
| BMI | 1.054 | 1.034 | 1.076 | <0.001 |
form <- formula(paste0("ncd ~" , paste(varlist, collapse = "+")))
full <- glm(form, data = dat, family = "binomial")
final <- step(full, direction = "both", trace = 0)
#summary(final)
p <- coef(summary(final))[,4]
sum_final <- data.frame(round(exp(cbind(OR = coef(final), confint(final))), 4),
p_value = format.pval(p, digits = 3, eps = 0.001))
## Waiting for profiling to be done...
colnames(sum_final) <- c("OR", "2.5%", "97.5%", "P-val")
rownames(sum_final)[1] <- "DietExcellent"
sum_final[1,]<-c("Ref","","","")
## Warning in `[<-.factor`(`*tmp*`, iseq, value = ""): invalid factor level, NA
## generated
knitr::kable(sum_final)
| OR | 2.5% | 97.5% | P-val | |
|---|---|---|---|---|
| DietExcellent | Ref | NA | ||
| DietVery good | 1.0064 | 0.6111 | 1.6442 | 0.9799 |
| DietGood | 1.2011 | 0.7547 | 1.8903 | 0.4330 |
| DietFair | 1.7663 | 1.0597 | 2.9321 | 0.0281 |
| DietPoor | 1.5279 | 0.779 | 3.0809 | 0.2249 |
| AlcoholNo | 0.7081 | 0.5142 | 0.9671 | 0.0320 |
| Caffeine | 0.9995 | 0.999 | 1 | 0.0546 |
| Trouble_sleepNo | 0.4385 | 0.3193 | 0.5963 | <0.001 |
| Weight | 1.0116 | 1.0079 | 1.0156 | <0.001 |
## Effect displays
plot(allEffects(final))
rocplot <- function(truth, pred, ...){
predob = prediction(pred, truth)
perf = performance(predob, "tpr", "fpr")
plot(perf, ...)
area = auc(truth, pred)
area = format(round(area, 4), nsmall = 4)
text(x = .8, y = .1, labels = paste("AUC = ", area))
segments(x0 = 0, y0 = 0, x1 = 1, y1 = 1, col = "gray", lty = 2)
}
rocplot(dat$ncd, final$fitted.values)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls < cases
dat2 <- dat %>%
mutate(ncd = bp + bi + cd)
dat2$ncd <- factor(dat2$ncd)
dat2 <- dat2 %>%
dplyr::select(-smok_yn, -Weight)
head(dat2)
dat2$ncd_p <- factor(dat2$ncd, levels = 0:4, labels = c(0:3, "P-value"))
rndr2 <- function(x, name, ...){
if (length(x) == 0){
y <- dat2[[name]]
s <- rep("", length(render.default(x = y, name = name)))
if (is.numeric(y)){
p <- kruskal.test(y ~ dat2$ncd_p)$p.value
} else{
p <- chisq.test(table(y, droplevels(dat2$ncd_p)))$p.value
}
s[2] <- sub("<", "<", format.pval(p, digits = 3, eps = 0.001))
s
} else{
render.default(x = x, name = name, ...)
}
}
rndr.strat <- function(label, n, ...){
ifelse(n == 0, label, render.strat.default(label, n, ...))
}
table1(~ Diet+Alcohol+Vitamin_C+Caffeine+Sleep+Trouble_sleep+
Smoking_age + BMI| ncd_p, data = dat2, droplevels = F,
render = rndr2, render.strat = rndr.strat, overall = F)
| 0 (n=301) |
1 (n=418) |
2 (n=347) |
3 (n=68) |
P-value | |
|---|---|---|---|---|---|
| Diet | |||||
| Excellent | 38 (12.6%) | 47 (11.2%) | 26 (7.5%) | 3 (4.4%) | 0.00186 |
| Very good | 74 (24.6%) | 87 (20.8%) | 54 (15.6%) | 15 (22.1%) | |
| Good | 119 (39.5%) | 166 (39.7%) | 128 (36.9%) | 25 (36.8%) | |
| Fair | 53 (17.6%) | 89 (21.3%) | 105 (30.3%) | 18 (26.5%) | |
| Poor | 17 (5.6%) | 29 (6.9%) | 34 (9.8%) | 7 (10.3%) | |
| Alcohol | |||||
| Yes | 71 (23.6%) | 121 (28.9%) | 116 (33.4%) | 29 (42.6%) | 0.0041 |
| No | 230 (76.4%) | 297 (71.1%) | 231 (66.6%) | 39 (57.4%) | |
| Vitamin_C | |||||
| Mean (SD) | 75.2 (94.6) | 67.3 (83.1) | 76.2 (79.6) | 63.4 (69.4) | 0.133 |
| Median [Min, Max] | 42.8 [0.00, 745] | 39.9 [0.00, 921] | 49.8 [0.00, 545] | 33.0 [0.400, 331] | |
| Caffeine | |||||
| Mean (SD) | 232 (360) | 221 (277) | 184 (224) | 236 (293) | 0.0911 |
| Median [Min, Max] | 156 [0.00, 4530] | 160 [0.00, 2950] | 132 [0.00, 1920] | 153 [0.00, 1760] | |
| Sleep | |||||
| Mean (SD) | 7.57 (1.38) | 7.74 (1.66) | 7.53 (1.68) | 7.60 (1.82) | 0.413 |
| Median [Min, Max] | 7.50 [3.00, 11.5] | 8.00 [2.50, 13.5] | 7.50 [2.00, 13.0] | 8.00 [2.00, 11.0] | |
| Trouble_sleep | |||||
| Yes | 69 (22.9%) | 147 (35.2%) | 155 (44.7%) | 39 (57.4%) | <0.001 |
| No | 232 (77.1%) | 271 (64.8%) | 192 (55.3%) | 29 (42.6%) | |
| Smoking_age | |||||
| Mean (SD) | 18.8 (6.60) | 18.4 (6.69) | 18.1 (7.70) | 17.6 (7.03) | 0.0115 |
| Median [Min, Max] | 18.0 [0.00, 54.0] | 18.0 [0.00, 49.0] | 17.0 [0.00, 58.0] | 16.0 [0.00, 45.0] | |
| BMI | |||||
| Mean (SD) | 26.2 (5.99) | 27.8 (6.63) | 29.0 (7.45) | 32.7 (8.89) | <0.001 |
| Median [Min, Max] | 25.8 [0.00105, 53.7] | 27.5 [0.000956, 55.3] | 28.3 [0.000984, 57.2] | 32.4 [0.00102, 64.2] |
Continuous Variable: Alcohol_freq, Vitamin_C, Caffeine, Sleep, Smoking_age, BMI (Draw Boxplots for the continuous variables)
ggboxplot(data = dat2, x = "ncd", y = "Alcohol_freq", add = "jitter", color = "ncd") +
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("How often drink alcohol over past 12 mos") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "Vitamin_C", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Vitamin C (mg)") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "Caffeine", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Caffeine (mg)") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "Sleep", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Sleep hours") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "Smoking_age", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("Age started smoking cigarettes regularly") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
# ggboxplot(data = dat2, x = "ncd", y = "Weight", add = "jitter", color = "ncd")+
# theme(legend.position = "none") +
# xlab("Having one or more diseases") +
# ylab("Current self-reported weight (pounds)") +
# stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
ggboxplot(data = dat2, x = "ncd", y = "BMI", add = "jitter", color = "ncd")+
theme(legend.position = "none") +
xlab("Having one or more diseases") +
ylab("BMI") +
stat_compare_means(method = "anova", label.x.npc = "center", label.y.npc = "top")
Discrete variable Diet, Alcohol, Trouble_sleep (Draw Mosaic plots for discrete variables)
pval <- function(p){
if (p < 0.001) {
return("< 0.001")
} else {return(round(p, 3))}
}
ggplot(data = dat2) +
geom_mosaic(aes(x = product(Diet, ncd), fill = Diet)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Diet") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$DBQ700, correct = F)$p.value)))
ggplot(data = dat2) +
geom_mosaic(aes(x = product(Alcohol, ncd), fill = Alcohol)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Alcohol") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$ALQ151, correct = F)$p.value)))
ggplot(data = dat2) +
geom_mosaic(aes(x = product(Trouble_sleep, ncd), fill = Trouble_sleep)) +
xlab(label = "Having one or more diseases") +
ylab(label = "Having trouble sleeping") +
theme(legend.position = "none") +
geom_text(x = .8, y = .8,
label = paste0("Chisq p ",
pval(chisq.test(dat$ncd, dat$SLQ050, correct = F)$p.value)))
Continuous Variable: ALQ120Q, DR1TVC, DR1TCAFF, SLD012, SMD030, WHD020, bmi
(Use two sample t-test with two-sided 0.05 significance level to compare the mean level of each variable between two NCDs outcomes)
kruskal.test(dat2$ncd,dat2$Alcohol_freq)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$Alcohol_freq
## Kruskal-Wallis chi-squared = 38.221, df = 19, p-value = 0.005563
kruskal.test(dat2$ncd,dat2$Vitamin_C)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$Vitamin_C
## Kruskal-Wallis chi-squared = 787.8, df = 795, p-value = 0.5652
kruskal.test(dat2$ncd,dat2$Caffeine)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$Caffeine
## Kruskal-Wallis chi-squared = 411.78, df = 437, p-value = 0.8016
kruskal.test(dat2$ncd,dat2$Sleep)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$Sleep
## Kruskal-Wallis chi-squared = 24.654, df = 23, p-value = 0.3684
kruskal.test(dat2$ncd,dat2$Smoking_age)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$Smoking_age
## Kruskal-Wallis chi-squared = 71.148, df = 43, p-value = 0.004433
# kruskal.test(dat2$ncd,dat2$Weight)
kruskal.test(dat2$ncd,dat2$BMI)
##
## Kruskal-Wallis rank sum test
##
## data: dat2$ncd and dat2$BMI
## Kruskal-Wallis chi-squared = 757.46, df = 713, p-value = 0.1207
Discrete variable DBQ700, ALQ151, SLQ050, smok_yn (Use Chi-square tests to compare the proportions of diagnosed with NCDs for each discrete variable)
chisq.test(dat2$ncd,dat2$Diet)
##
## Pearson's Chi-squared test
##
## data: dat2$ncd and dat2$Diet
## X-squared = 31.165, df = 12, p-value = 0.001859
chisq.test(dat2$ncd,dat2$Alcohol)
##
## Pearson's Chi-squared test
##
## data: dat2$ncd and dat2$Alcohol
## X-squared = 13.265, df = 3, p-value = 0.004098
chisq.test(dat2$ncd,dat2$Trouble_sleep)
##
## Pearson's Chi-squared test
##
## data: dat2$ncd and dat2$Trouble_sleep
## X-squared = 47.138, df = 3, p-value = 3.248e-10
# Check the Proportional odds assumption for each variable
varlist2 <- colnames(dat2)[!(colnames(dat2) %in% c("SEQN", "bp", "bi", "cd", "ncd", "ncd_p"))]
prop_test <- data.frame()
for (var in varlist2){
form = formula(paste0("ncd ~ ", var))
logit = clm(form, data = dat2)
a = scale_test(logit)$`Pr(>Chi)`[2]
b = nominal_test(logit)$`Pr(>Chi)`[2]
OR = data.frame(round(exp(cbind(OR = coef(logit)[4], confint(logit))), 3))
p1 <- cbind(OR, p_val = coef(summary(logit))[-1:-3, 4] < 0.05,
satisfy_normality = a > 0.05,
satisfy_effect = b > 0.05)
prop_test <- rbind(prop_test, p1)
}
## Warning: (2) Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## In addition: Absolute and relative convergence criteria were met
prop_test
All other variables are violated the PH assumption.
comby <- function(y, j){
ans <- data.frame()
for (i in 1:3){
ans <- rbind(ans, y[,,i][j, ])
}
ans
}
comb <- function(x, y){
ans <- data.frame()
i = 3
for (i in 2:ncol(x)){
ans1 <- cbind(x[, i], comby(y, i))
colnames(ans1) <- c("OR", "2.5%", "97.5%")
rownames(ans1) <- paste0(colnames(x)[i], 1:3)
ans <- rbind(ans, ans1)
}
ans
}
univ <- function(var){
form = formula(paste0("ncd ~ ", var))
multi = multinom(form, data = dat2)
comb(exp(coef(multi)), exp(confint(multi)))
}
t<- round(do.call(rbind.data.frame, lapply(varlist2, univ)), 3)
## # weights: 24 (15 variable)
## initial value 1572.057806
## iter 10 value 1424.423662
## iter 20 value 1402.831276
## final value 1402.828975
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1414.736437
## iter 20 value 1408.522221
## final value 1408.519204
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1412.141794
## final value 1412.126076
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1416.988933
## final value 1416.987734
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1419.551744
## iter 20 value 1415.542862
## final value 1415.527198
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1416.931480
## final value 1416.930635
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1394.893024
## final value 1394.748706
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1417.527116
## final value 1417.522425
## converged
## # weights: 12 (6 variable)
## initial value 1572.057806
## iter 10 value 1389.613448
## final value 1389.501830
## converged
knitr::kable(t)
| OR | 2.5% | 97.5% | |
|---|---|---|---|
| DietVery good1 | 0.951 | 0.561 | 1.612 |
| DietVery good2 | 1.067 | 0.580 | 1.963 |
| DietVery good3 | 2.568 | 0.700 | 9.420 |
| DietGood1 | 1.128 | 0.692 | 1.838 |
| DietGood2 | 1.572 | 0.900 | 2.746 |
| DietGood3 | 2.661 | 0.761 | 9.307 |
| DietFair1 | 1.358 | 0.786 | 2.345 |
| DietFair2 | 2.896 | 1.592 | 5.267 |
| DietFair3 | 4.302 | 1.183 | 15.649 |
| DietPoor1 | 1.379 | 0.661 | 2.878 |
| DietPoor2 | 2.923 | 1.358 | 6.292 |
| DietPoor3 | 5.216 | 1.201 | 22.649 |
| Alcohol_freq1 | 1.004 | 0.991 | 1.017 |
| Alcohol_freq2 | 0.926 | 0.882 | 0.972 |
| Alcohol_freq3 | 0.868 | 0.776 | 0.972 |
| AlcoholNo1 | 0.758 | 0.539 | 1.064 |
| AlcoholNo2 | 0.615 | 0.434 | 0.870 |
| AlcoholNo3 | 0.415 | 0.240 | 0.719 |
| Vitamin_C1 | 0.999 | 0.997 | 1.001 |
| Vitamin_C2 | 1.000 | 0.998 | 1.002 |
| Vitamin_C3 | 0.998 | 0.995 | 1.002 |
| Caffeine1 | 1.000 | 0.999 | 1.000 |
| Caffeine2 | 0.999 | 0.999 | 1.000 |
| Caffeine3 | 1.000 | 0.999 | 1.001 |
| Sleep1 | 1.066 | 0.972 | 1.170 |
| Sleep2 | 0.985 | 0.895 | 1.084 |
| Sleep3 | 1.012 | 0.859 | 1.193 |
| Trouble_sleepNo1 | 0.548 | 0.392 | 0.767 |
| Trouble_sleepNo2 | 0.368 | 0.262 | 0.519 |
| Trouble_sleepNo3 | 0.221 | 0.128 | 0.384 |
| Smoking_age1 | 0.994 | 0.974 | 1.015 |
| Smoking_age2 | 0.986 | 0.965 | 1.008 |
| Smoking_age3 | 0.977 | 0.939 | 1.016 |
| BMI1 | 1.036 | 1.013 | 1.059 |
| BMI2 | 1.065 | 1.039 | 1.091 |
| BMI3 | 1.141 | 1.100 | 1.184 |
form2 <- formula(paste0("ncd ~" , paste(varlist2, collapse = "+")))
full2 <- clm(ncd ~ Diet + Alcohol + Vitamin_C + Caffeine +
Sleep + Trouble_sleep + Smoking_age + BMI, data = dat2)
final2 <- step(full2, direction = "both", trace = 0)
sum_final2 <- round(cbind(exp(coef(final2)[4:12]), exp(confint(final2))),3)
colnames(sum_final2)[1] <- "OR"
DietExcellent<-c("Ref","","")
sum_final2<-rbind(DietExcellent,sum_final2)
knitr::kable(sum_final2)
| OR | 2.5 % | 97.5 % | |
|---|---|---|---|
| DietExcellent | Ref | ||
| DietVery good | 1.148 | 0.76 | 1.738 |
| DietGood | 1.372 | 0.94 | 2.006 |
| DietFair | 2.005 | 1.338 | 3.01 |
| DietPoor | 1.759 | 1.041 | 2.975 |
| AlcoholNo | 0.682 | 0.538 | 0.865 |
| Caffeine | 1 | 0.999 | 1 |
| Trouble_sleepNo | 0.489 | 0.389 | 0.614 |
| Smoking_age | 0.988 | 0.972 | 1.003 |
| BMI | 1.054 | 1.037 | 1.072 |
# FYI, Interaction: Not significant
## Effect displays
plot(allEffects(full2))
multiclass.roc(dat2$ncd, final2$fitted.values)
## Setting direction: controls < cases
## Setting direction: controls < cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
## Setting direction: controls > cases
##
## Call:
## multiclass.roc.default(response = dat2$ncd, predictor = final2$fitted.values)
##
## Data: final2$fitted.values with 4 levels of dat2$ncd: 0, 1, 2, 3.
## Multi-class area under the curve: 0.8054